{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d124d22-de73-436b-86cd-9b162b469be8",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install langchain_community\n",
    "%pip install langchain_experimental \n",
    "%pip install langchain-openai \n",
    "%pip install langchainhub \n",
    "%pip install chromadb \n",
    "%pip install langchain\n",
    "%pip install beautifulsoup4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f884314f-870c-4bfb-b6c1-a5b4801ec172",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from langchain_community.document_loaders import WebBaseLoader\n",
    "import bs4\n",
    "import openai\n",
    "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
    "from langchain import hub\n",
    "from langchain_core.output_parsers import StrOutputParser\n",
    "from langchain_core.runnables import RunnablePassthrough\n",
    "import chromadb\n",
    "from langchain_community.vectorstores import Chroma\n",
    "from langchain_experimental.text_splitter import SemanticChunker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "721241b4-32ab-476a-a5ac-9feab48459e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# OpenAI Setup\n",
    "os.environ['OPENAI_API_KEY'] = ''\n",
    "openai.api_key = os.environ['OPENAI_API_KEY']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3ad428a-3eb6-40ec-a1a5-62565ead1e5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "#### INDEXING ####"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98ccda2c-0f4c-41c5-804d-2227cdf35aa7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Documents\n",
    "loader = WebBaseLoader(\n",
    "    web_paths=(\"https://kbourne.github.io/chapter1.html\",), \n",
    "    bs_kwargs=dict(\n",
    "        parse_only=bs4.SoupStrainer(\n",
    "            class_=(\"post-content\", \"post-title\", \"post-header\")\n",
    "        )\n",
    "    ),\n",
    ")\n",
    "docs = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "927a4c65-aa05-486c-8295-2f99673e7c20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split\n",
    "text_splitter = SemanticChunker(OpenAIEmbeddings())\n",
    "splits = text_splitter.split_documents(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b13568c-d633-464d-8c43-0d55f34cc8c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embed\n",
    "vectorstore = Chroma.from_documents(documents=splits, \n",
    "                                    embedding=OpenAIEmbeddings())\n",
    "\n",
    "retriever = vectorstore.as_retriever()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ce8df01-925b-45b5-8fb8-17b5c40c581f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#### RETRIEVAL and GENERATION ####"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb47c817-b5ac-4d90-84ee-4cd209e52a80",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prompt\n",
    "prompt = hub.pull(\"jclemens24/rag-prompt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8975479-b3e3-481d-ad7b-08b4eb3faaef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Post-processing\n",
    "def format_docs(docs):\n",
    "    return \"\\n\\n\".join(doc.page_content for doc in docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "deb6d70c-42ef-4bda-9607-48f02c941280",
   "metadata": {},
   "outputs": [],
   "source": [
    "# LLM\n",
    "llm = ChatOpenAI(model_name=\"gpt-4o-mini\", temperature=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd9db713-f705-4b65-800e-2c4e3d0e4ef4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chain it all together with LangChain\n",
    "rag_chain = (\n",
    "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
    "    | prompt\n",
    "    | llm\n",
    "    | StrOutputParser()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b30177a-f9ab-45e4-812d-33b0f97325bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Question - run the chain\n",
    "rag_chain.invoke(\"What are the advantages of using RAG?\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
